In [ ]:
 
In [1]:
%matplotlib inline

1. Importing Libraries¶

In [2]:
# Data preprocessing libraries
import numpy as np
import pandas as pd
from pandas.plotting import parallel_coordinates

import os
import sqlite3
import math
from collections import Counter
from pathlib import Path
from tqdm import tqdm

# Visualization
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio

# Model

from sklearn.decomposition import PCA 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Config
mpl.rcParams['font.family'] = 'monospace' 
sns.set_theme(style="white", palette=None)
plotly.offline.init_notebook_mode() 
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

2. Loading data¶

In [3]:
# Reading csv files and drop the first column
df_train = pd.read_csv(r"data/fraudTrain.csv")
df_train.drop(df_train.columns[0], axis=1, inplace=True)

df_test = pd.read_csv(r"data/fraudTest.csv")
df_test.drop(df_test.columns[0], axis=1, inplace=True)
In [4]:
df = pd.concat([df_train, df_test], axis =0).reset_index()
df.shape
Out[4]:
(1852394, 23)
In [5]:
# delete df_train and df_test to save memory
print('Deleting df_train and df_test')
del([df_train, df_test])
Deleting df_train and df_test
In [6]:
df.tail(3)
Out[6]:
index trans_date_trans_time cc_num merchant category amt first last gender street ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
1852391 555716 2020-12-31 23:59:15 6011724471098086 fraud_Rau-Robel kids_pets 86.88 Ann Lawson F 144 Evans Islands Apt. 683 ... 46.1966 -118.9017 3684 Musician 1981-11-29 6c5b7c8add471975aa0fec023b2e8408 1388534355 46.658340 -119.715054 0
1852392 555717 2020-12-31 23:59:24 4079773899158 fraud_Breitenberg LLC travel 7.99 Eric Preston M 7020 Doyle Stream Apt. 951 ... 44.6255 -116.4493 129 Cartographer 1965-12-15 14392d723bb7737606b2700ac791b7aa 1388534364 44.470525 -117.080888 0
1852393 555718 2020-12-31 23:59:34 4170689372027579 fraud_Dare-Marvin entertainment 38.13 Samuel Frey M 830 Myers Plaza Apt. 384 ... 35.6665 -97.4798 116001 Media buyer 1993-05-10 1765bb45b3aa3224b4cdcb6e7a96cee3 1388534374 36.210097 -97.036372 0

3 rows × 23 columns

3. Preprocessing¶

3.1 Rename some columns¶

In [7]:
df.columns
Out[7]:
Index(['index', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')
In [8]:
df.rename(columns={"trans_date_trans_time":"transaction_time",
                         "cc_num":"credit_card_number",
                         "amt":"amount(usd)",
                         "trans_num":"transaction_id"},
                inplace=True)

3.2 Convert time columns into dataframe time¶

Transaction_time and dob should be in pd.datetime format

In [9]:
df["transaction_time"] = pd.to_datetime(df["transaction_time"], infer_datetime_format=True)
df["dob"] = pd.to_datetime(df["dob"], infer_datetime_format=True)
In [10]:
from datetime import datetime

# Apply function utcfromtimestamp and drop column unix_time
df['time'] = df['unix_time'].apply(datetime.utcfromtimestamp)
df.drop('unix_time', axis=1)

# Add cloumn hour of day
df['hour_of_day'] = df.time.dt.hour
In [11]:
df[['time','hour_of_day']]
Out[11]:
time hour_of_day
0 2012-01-01 00:00:18 0
1 2012-01-01 00:00:44 0
2 2012-01-01 00:00:51 0
3 2012-01-01 00:01:16 0
4 2012-01-01 00:03:06 0
... ... ...
1852389 2013-12-31 23:59:07 23
1852390 2013-12-31 23:59:09 23
1852391 2013-12-31 23:59:15 23
1852392 2013-12-31 23:59:24 23
1852393 2013-12-31 23:59:34 23

1852394 rows × 2 columns

3.3 convert data types¶

In [12]:
# Change dtypes
df.credit_card_number = df.credit_card_number.astype('category')
df.is_fraud = df.is_fraud.astype('category')
df.hour_of_day = df.hour_of_day.astype('category')

# Check
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 25 columns):
 #   Column              Dtype         
---  ------              -----         
 0   index               int64         
 1   transaction_time    datetime64[ns]
 2   credit_card_number  category      
 3   merchant            object        
 4   category            object        
 5   amount(usd)         float64       
 6   first               object        
 7   last                object        
 8   gender              object        
 9   street              object        
 10  city                object        
 11  state               object        
 12  zip                 int64         
 13  lat                 float64       
 14  long                float64       
 15  city_pop            int64         
 16  job                 object        
 17  dob                 datetime64[ns]
 18  transaction_id      object        
 19  unix_time           int64         
 20  merch_lat           float64       
 21  merch_long          float64       
 22  is_fraud            category      
 23  time                datetime64[ns]
 24  hour_of_day         category      
dtypes: category(3), datetime64[ns](3), float64(5), int64(4), object(10)
memory usage: 318.0+ MB

4. Exploratory data analysis (EDA)¶

In [13]:
np.round(df.describe(), 2)
Out[13]:
index amount(usd) zip lat long city_pop unix_time merch_lat merch_long
count 1852394.00 1852394.00 1852394.00 1852394.00 1852394.00 1852394.00 1.852394e+06 1852394.00 1852394.00
mean 537193.44 70.06 48813.26 38.54 -90.23 88643.67 1.358674e+09 38.54 -90.23
std 366910.96 159.25 26881.85 5.07 13.75 301487.62 1.819508e+07 5.11 13.76
min 0.00 1.00 1257.00 20.03 -165.67 23.00 1.325376e+09 19.03 -166.67
25% 231549.00 9.64 26237.00 34.67 -96.80 741.00 1.343017e+09 34.74 -96.90
50% 463098.00 47.45 48174.00 39.35 -87.48 2443.00 1.357089e+09 39.37 -87.44
75% 833575.75 83.10 72042.00 41.94 -80.16 20328.00 1.374581e+09 41.96 -80.25
max 1296674.00 28948.90 99921.00 66.69 -67.95 2906700.00 1.388534e+09 67.51 -66.95

4.1. General EDA on the data¶

In [14]:
groups = [pd.Grouper(key="transaction_time", freq="1W"), "is_fraud"]
df_ = df.groupby(by=groups).agg({"amount(usd)":'mean',"transaction_id":"count"}).reset_index()
In [15]:
def add_traces(df, x, y,hue, mode, cmap, showlegend=None):
    name_map = {1:"Yes", 0:"No"}
    traces = []
    for flag in df[hue].unique():
        traces.append(
            go.Scatter(
                x=df[df[hue]==flag][x],
                y=df[df[hue]==flag][y],
                mode=mode,
                marker=dict(color=cmap[flag]),
                showlegend=showlegend,
                name=name_map[flag]
            )
        )
    return traces
In [16]:
fig = make_subplots(rows=2, cols=2,
                    specs=[
                        [{}, {}],
                        [{"colspan":2}, None]
                    ],
                    subplot_titles=("Amount(usd) over time", "Number of transactions overtime",
                                    "Number of transaction by amount(usd)")
                   )

ntraces = add_traces(df=df_,x='transaction_time',y='amount(usd)',hue='is_fraud',mode='lines',
                    showlegend=True, cmap=['#61E50F','#D93C1D'])

for trace in ntraces:
    fig.add_trace(
        trace,
        row=1,col=1
    )
    
ntraces = add_traces(df=df_,x='transaction_time',y='transaction_id',hue='is_fraud',mode='lines',
                    showlegend=False, cmap=['#61E50F','#D93C1D'])
for trace in ntraces:
    fig.add_trace(
        trace,
        row=1,col=2
    )

ntraces = add_traces(df=df_,x='transaction_id',y='amount(usd)',hue='is_fraud',mode='markers',
                    showlegend=True, cmap=['#61E50F','#D93C1D'])
for trace in ntraces:
    fig.add_trace(
        trace,
        row=2,col=1
    )

fig.update_layout(height=780,
                  width=960,
                  legend=dict(title='Is fraud?'),
                  plot_bgcolor='#fafafa',
                  title='Overview'
                 )

fig.show()
In [17]:
df_ = df.groupby(by=[pd.Grouper(key="transaction_time", freq="1W"),
                        'is_fraud','category']).agg({"amount(usd)":'mean',"transaction_id":"count"}).reset_index()

fig = px.scatter(df_,
        x='transaction_time',
        y='amount(usd)',
        color='is_fraud',
        facet_col ='category',
        facet_col_wrap=3,
        facet_col_spacing=.04,
        color_discrete_map={0:'#61E50F', 1:'#D93C1D'}
)

fig.update_layout(height=1400,
                        width=960,
                        legend=dict(title='Is fraud?'),
                        plot_bgcolor='#fafafa'
                )

fig.update_yaxes(matches=None)
fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
fig.for_each_xaxis(lambda xaxis: xaxis.update(showticklabels=True, title=''))

fig.show();
In [18]:
df_ = df.groupby(by=[pd.Grouper(key="transaction_time", freq="1M"),
                           'is_fraud','category']).agg({"amount(usd)":'sum',"transaction_id":"count"}).reset_index()

fig = px.area(
    df_[df_.is_fraud==1],
    x='transaction_time',
    y='amount(usd)',
    color='category',
    color_discrete_sequence=px.colors.qualitative.Dark24
)

fig.update_layout(height=600,
                  width=960,
                  legend=dict(title='Categories'),
                  plot_bgcolor='#fafafa'
                 )

fig.show();
In [19]:
# Specified list of 12 merchants with the highest number of transactions.
top12_merchants = df.merchant.value_counts()[:12]

df_ = df.groupby(by=[pd.Grouper(key="transaction_time", freq="1W"),'is_fraud',
                           'merchant']).agg({"amount(usd)":'mean',"transaction_id":"count"}).reset_index()

df_ = df_[df_.merchant.isin(top12_merchants.index)]
In [20]:
fig = px.scatter(df_,
        x='transaction_time',
        y='amount(usd)',
        color='is_fraud',
        facet_col ='merchant',
        facet_col_wrap=3,
        facet_col_spacing=.06,
        category_orders={'merchant': top12_merchants.index}, # order the subplots
        color_discrete_map={1:'#61E50F', 0:'#D93C1D'}
)

fig.update_layout(height=1200,
                  width=960,
                  title='Top 12 merchants with highest number of transactions per week',
                  legend=dict(title='Is fraud?'),
                  plot_bgcolor='#fafafa'
                 )

fig.update_yaxes(matches=None)
fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
fig.for_each_xaxis(lambda xaxis: xaxis.update(showticklabels=True, title=''))

fig.show();

Jobs with the highest fraud transactions

In [21]:
groups = ['is_fraud','job']
df_ = df.groupby(by=groups).agg({"amount(usd)":'mean',"transaction_id":"count"}).fillna(0).reset_index()

# Top 10 jobs had most fraud transactions.
df_ = df_[df_.is_fraud==1].sort_values(by='transaction_id',
                                    ascending=False).drop_duplicates('job', keep='first').iloc[:10, :]
df_
Out[21]:
is_fraud job amount(usd) transaction_id
880 1 Quantity surveyor 611.805652 69
806 1 Naval architect 650.121970 66
784 1 Materials engineer 561.092097 62
539 1 Audiological scientist 662.505172 58
918 1 Senior tax professional/tax inspector 570.492456 57
977 1 Trading standards officer 478.137143 56
843 1 Podiatrist 477.762593 54
691 1 Film/video editor 528.820577 52
589 1 Colour technologist 440.824706 51
685 1 Exhibition designer 524.067255 51
In [22]:
fig = px.bar(df_,
             y='job', x='transaction_id',
             color='amount(usd)',
             color_continuous_scale=px.colors.sequential.Rainbow,
             labels={'job':'Job title', 
                     'transaction_id': 'Number of fraud transactions'},
             category_orders = {"job": df_.job.values},
             width=960,
             height=600)

fig.update_layout(
    title=dict(
        text='Amount(usd) among top 10 jobs with the most fraud transactions'
    ),
    plot_bgcolor='#fafafa'
)

fig.update_coloraxes(
    colorbar=dict(
        title='Amount(usd) of transactions',
        orientation='h',
        x=1
    ),
    reversescale=True
)

fig.show()

credit card holder with most fraud transaction

In [23]:
groups = ['credit_card_number']
df_ = df.groupby(by=groups).agg({"amount(usd)":'mean',"transaction_id":"count"}).fillna(0).reset_index()
df_.sort_values('transaction_id', ascending=False, inplace=True)
df_ = df_.head(10)
df_
Out[23]:
credit_card_number amount(usd) transaction_id
185 30270432095985 56.479135 4392
886 6538441737335434 76.542413 4392
887 6538891242532018 87.509667 4386
703 4364010865167176 47.876443 4386
747 4642255475285942 59.124403 4386
843 6011438889172900 91.422839 4385
332 344709867813900 89.378027 4385
787 4904681492230012 60.779008 4384
737 4586810168620942 72.951437 4384
135 4745996322265 75.752662 4384
In [ ]:
 
In [24]:
df_ = df[df.is_fraud==1].groupby(by='hour_of_day').agg({'transaction_id':'count'}).reset_index()

fig = px.bar(data_frame=df_,
       x='hour_of_day',
       y='transaction_id',
       labels={'transaction_id':'Number of transaction'})

fig.update_layout(
    title=dict(
        text='Number of FRAUD transactions by hours of day'
    ),
    plot_bgcolor='#ED2B2A'
)

fig.update_xaxes(type='category')

Feature co-relation on the train data¶

In [25]:
fig = plt.figure(figsize=(18,9))
mask = np.triu(np.ones_like(df.corr()))
sns.heatmap(df.corr(), mask=mask, cmap='coolwarm', annot=True)
C:\Users\ndaru\AppData\Local\Temp\ipykernel_1380\2504256764.py:2: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

C:\Users\ndaru\AppData\Local\Temp\ipykernel_1380\2504256764.py:3: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

Out[25]:
<Axes: >
In [26]:
features = ['transaction_id', 'hour_of_day', 'category', 'amount(usd)', 'merchant', 'job']

#
X = df[features].set_index("transaction_id")
y = df['is_fraud']

print('X shape:{}\ny shape:{}'.format(X.shape,y.shape))
X shape:(1852394, 5)
y shape:(1852394,)
In [27]:
# Encoding categorical data
enc = OrdinalEncoder(dtype=np.int64)
enc.fit(X.loc[:,['category','merchant','job']])

X.loc[:, ['category','merchant','job']] = enc.transform(X[['category','merchant','job']])
C:\Users\ndaru\AppData\Local\Temp\ipykernel_1380\4109020072.py:5: DeprecationWarning:

In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`

In [28]:
X['hour_of_day'] = X['hour_of_day'].astype(int)
X.head()
Out[28]:
hour_of_day category amount(usd) merchant job
transaction_id
0b242abb623afc578575680df30655b9 0 8 4.97 514 372
1f76529f8574734946361c461b024d99 0 4 107.23 241 431
a1a22d70485983eac12b5b88dad1cf95 0 0 220.11 390 308
6b849c168bdad6f867558c3793159a81 0 2 45.00 360 330
a41d7549acf90789359a9aa5346dcb46 0 9 41.96 297 116
In [29]:
X.info()
<class 'pandas.core.frame.DataFrame'>
Index: 1852394 entries, 0b242abb623afc578575680df30655b9 to 1765bb45b3aa3224b4cdcb6e7a96cee3
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   hour_of_day  int32  
 1   category     int64  
 2   amount(usd)  float64
 3   merchant     int64  
 4   job          int64  
dtypes: float64(1), int32(1), int64(3)
memory usage: 77.7+ MB
In [30]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))
Training set size: 1481915
Testing set size: 370479

Angle Based Outlier Detection¶

In [31]:
from abod import ABOD
In [32]:
# Create an instance of ABOD model
model = ABOD()
model.fit(X_train)
Out[32]:
ABOD(contamination=0.1, method='fast', n_neighbors=5)
In [33]:
# Predict the anomaly scores and plot them
scores = model.decision_scores_

# plt.figure(figsize=(6,5))
# plt.plot(scores)
# plt.title('Training Curve')
# plt.xlabel('Data Point')
# plt.ylabel('Outlier Score')
# plt.show()
In [34]:
# Identify the outliers based on a threshold
threshold_ABOD = np.percentile(scores, 90)
anomalies_ABOD = np.where(scores > threshold_ABOD)[0]

Model evaluation¶

In [35]:
print("Number of anomalies in ABOD:", len(anomalies_ABOD))
print("Indices of anomalies:", anomalies_ABOD)
Number of anomalies in ABOD: 148192
Indices of anomalies: [      9      14      27 ... 1481879 1481885 1481892]
In [36]:
y_pred = model.predict(X_test)
In [37]:
from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1-score: {f1:.3f}")
Precision: 0.042, Recall: 0.802, F1-score: 0.080
In [38]:
clfr = classification_report(y_test, y_pred)
print("classification report:\n", clfr)
classification report:
               precision    recall  f1-score   support

           0       1.00      0.90      0.95    368526
           1       0.04      0.80      0.08      1953

    accuracy                           0.90    370479
   macro avg       0.52      0.85      0.51    370479
weighted avg       0.99      0.90      0.94    370479

In [39]:
# accuracy of the model
accuracy_ABOD = accuracy_score(y_test, y_pred)
print("Accuracy of the model:", (100 - accuracy_ABOD))
Accuracy of the model: 99.09692317243352

Autoencoders¶

In [40]:
from sklearn.preprocessing import StandardScaler
from keras.layers import Input, Dense
from keras.models import Model
In [41]:
# Normalize the data
scaler = StandardScaler()
X_norm = scaler.fit_transform(X_train)

Define the autoencoder architecture¶

In [42]:
input_layer = Input(shape=(X_norm.shape[1],))
encoded = Dense(32, activation='relu')(input_layer)
decoded = Dense(X_norm.shape[1], activation='linear')(encoded)
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mse')
In [43]:
# Train the autoencoder
history = autoencoder.fit(X_norm, X_norm, epochs=100, batch_size=32, validation_data=(X_test, X_test))

# Plot the loss values
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Autoencoder Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()
Epoch 1/100
46310/46310 [==============================] - 101s 2ms/step - loss: 0.0027 - val_loss: 798.8865
Epoch 2/100
46310/46310 [==============================] - 101s 2ms/step - loss: 8.2566e-05 - val_loss: 812.2175
Epoch 3/100
46310/46310 [==============================] - 91s 2ms/step - loss: 1.3715e-04 - val_loss: 799.1724
Epoch 4/100
46310/46310 [==============================] - 87s 2ms/step - loss: 7.1967e-05 - val_loss: 785.7149
Epoch 5/100
46310/46310 [==============================] - 93s 2ms/step - loss: 2.5809e-05 - val_loss: 745.5618
Epoch 6/100
46310/46310 [==============================] - 90s 2ms/step - loss: 5.8646e-05 - val_loss: 665.4046
Epoch 7/100
46310/46310 [==============================] - 90s 2ms/step - loss: 2.4789e-05 - val_loss: 683.3556
Epoch 8/100
46310/46310 [==============================] - 86s 2ms/step - loss: 2.0018e-05 - val_loss: 795.3459
Epoch 9/100
46310/46310 [==============================] - 97s 2ms/step - loss: 4.9162e-05 - val_loss: 737.0353
Epoch 10/100
46310/46310 [==============================] - 93s 2ms/step - loss: 3.3235e-05 - val_loss: 597.7823
Epoch 11/100
46310/46310 [==============================] - 98s 2ms/step - loss: 5.2630e-05 - val_loss: 488.4552
Epoch 12/100
46310/46310 [==============================] - 1697s 37ms/step - loss: 1.8584e-05 - val_loss: 474.4684
Epoch 13/100
46310/46310 [==============================] - 95s 2ms/step - loss: 1.8993e-05 - val_loss: 416.3220
Epoch 14/100
46310/46310 [==============================] - 101s 2ms/step - loss: 3.0021e-05 - val_loss: 371.9894
Epoch 15/100
46310/46310 [==============================] - 98s 2ms/step - loss: 2.6584e-05 - val_loss: 387.7877
Epoch 16/100
46310/46310 [==============================] - 101s 2ms/step - loss: 1.3983e-04 - val_loss: 542.9305
Epoch 17/100
46310/46310 [==============================] - 95s 2ms/step - loss: 9.8662e-06 - val_loss: 442.0613
Epoch 18/100
46310/46310 [==============================] - 101s 2ms/step - loss: 1.0518e-05 - val_loss: 483.0312
Epoch 19/100
46310/46310 [==============================] - 100s 2ms/step - loss: 1.3891e-05 - val_loss: 435.2597
Epoch 20/100
46310/46310 [==============================] - 97s 2ms/step - loss: 1.2011e-05 - val_loss: 385.5776
Epoch 21/100
46310/46310 [==============================] - 97s 2ms/step - loss: 1.3489e-05 - val_loss: 346.0983
Epoch 22/100
46310/46310 [==============================] - 87s 2ms/step - loss: 6.6764e-06 - val_loss: 297.7719
Epoch 23/100
46310/46310 [==============================] - 86s 2ms/step - loss: 7.1062e-06 - val_loss: 237.6304
Epoch 24/100
46310/46310 [==============================] - 91s 2ms/step - loss: 6.9782e-06 - val_loss: 226.9504
Epoch 25/100
46310/46310 [==============================] - 87s 2ms/step - loss: 1.0896e-05 - val_loss: 178.5828
Epoch 26/100
46310/46310 [==============================] - 91s 2ms/step - loss: 8.5577e-06 - val_loss: 115.1383
Epoch 27/100
46310/46310 [==============================] - 87s 2ms/step - loss: 5.4364e-06 - val_loss: 81.3829
Epoch 28/100
46310/46310 [==============================] - 99s 2ms/step - loss: 1.4315e-05 - val_loss: 67.3545
Epoch 29/100
46310/46310 [==============================] - 108s 2ms/step - loss: 6.7536e-06 - val_loss: 70.9670
Epoch 30/100
46310/46310 [==============================] - 168s 4ms/step - loss: 1.0562e-05 - val_loss: 67.4418
Epoch 31/100
46310/46310 [==============================] - 105s 2ms/step - loss: 6.4250e-06 - val_loss: 67.4423
Epoch 32/100
46310/46310 [==============================] - 100s 2ms/step - loss: 1.3594e-05 - val_loss: 67.4837
Epoch 33/100
46310/46310 [==============================] - 108s 2ms/step - loss: 6.0162e-06 - val_loss: 67.2489
Epoch 34/100
46310/46310 [==============================] - 108s 2ms/step - loss: 7.6263e-06 - val_loss: 67.4615
Epoch 35/100
46310/46310 [==============================] - 101s 2ms/step - loss: 7.7855e-06 - val_loss: 73.2368
Epoch 36/100
46310/46310 [==============================] - 106s 2ms/step - loss: 5.8810e-06 - val_loss: 67.7864
Epoch 37/100
46310/46310 [==============================] - 100s 2ms/step - loss: 9.4593e-06 - val_loss: 67.4481
Epoch 38/100
46310/46310 [==============================] - 107s 2ms/step - loss: 6.6285e-06 - val_loss: 65.3129
Epoch 39/100
46310/46310 [==============================] - 100s 2ms/step - loss: 3.3031e-06 - val_loss: 67.4063
Epoch 40/100
46310/46310 [==============================] - 106s 2ms/step - loss: 3.6143e-06 - val_loss: 67.9348
Epoch 41/100
46310/46310 [==============================] - 101s 2ms/step - loss: 8.0979e-06 - val_loss: 69.3459
Epoch 42/100
46310/46310 [==============================] - 110s 2ms/step - loss: 6.5660e-06 - val_loss: 67.2906
Epoch 43/100
46310/46310 [==============================] - 100s 2ms/step - loss: 1.0220e-05 - val_loss: 67.5379
Epoch 44/100
46310/46310 [==============================] - 105s 2ms/step - loss: 5.8966e-06 - val_loss: 67.3803
Epoch 45/100
46310/46310 [==============================] - 104s 2ms/step - loss: 3.5860e-06 - val_loss: 66.1019
Epoch 46/100
46310/46310 [==============================] - 100s 2ms/step - loss: 5.3923e-06 - val_loss: 67.4551
Epoch 47/100
46310/46310 [==============================] - 113s 2ms/step - loss: 6.4108e-06 - val_loss: 67.4115
Epoch 48/100
46310/46310 [==============================] - 100s 2ms/step - loss: 4.6064e-06 - val_loss: 70.1752
Epoch 49/100
46310/46310 [==============================] - 105s 2ms/step - loss: 5.0708e-06 - val_loss: 79.5460
Epoch 50/100
46310/46310 [==============================] - 101s 2ms/step - loss: 5.4105e-06 - val_loss: 75.7583
Epoch 51/100
46310/46310 [==============================] - 107s 2ms/step - loss: 3.7249e-06 - val_loss: 64.4269
Epoch 52/100
46310/46310 [==============================] - 100s 2ms/step - loss: 3.1741e-06 - val_loss: 70.8249
Epoch 53/100
46310/46310 [==============================] - 103s 2ms/step - loss: 1.4301e-05 - val_loss: 73.1375
Epoch 54/100
46310/46310 [==============================] - 98s 2ms/step - loss: 1.0320e-05 - val_loss: 67.3938
Epoch 55/100
46310/46310 [==============================] - 105s 2ms/step - loss: 3.0510e-06 - val_loss: 67.5344
Epoch 56/100
46310/46310 [==============================] - 98s 2ms/step - loss: 4.2315e-06 - val_loss: 66.9729
Epoch 57/100
46310/46310 [==============================] - 103s 2ms/step - loss: 3.6442e-06 - val_loss: 67.4425
Epoch 58/100
46310/46310 [==============================] - 102s 2ms/step - loss: 4.3597e-06 - val_loss: 67.4446
Epoch 59/100
46310/46310 [==============================] - 99s 2ms/step - loss: 1.7322e-05 - val_loss: 76.4163
Epoch 60/100
46310/46310 [==============================] - 104s 2ms/step - loss: 5.4840e-06 - val_loss: 67.4960
Epoch 61/100
46310/46310 [==============================] - 97s 2ms/step - loss: 6.6607e-06 - val_loss: 69.4176
Epoch 62/100
46310/46310 [==============================] - 103s 2ms/step - loss: 5.0113e-06 - val_loss: 69.9497
Epoch 63/100
46310/46310 [==============================] - 98s 2ms/step - loss: 7.2803e-06 - val_loss: 67.7120
Epoch 64/100
46310/46310 [==============================] - 103s 2ms/step - loss: 4.8563e-06 - val_loss: 67.4497
Epoch 65/100
46310/46310 [==============================] - 100s 2ms/step - loss: 2.9979e-06 - val_loss: 67.0462
Epoch 66/100
46310/46310 [==============================] - 104s 2ms/step - loss: 4.0548e-06 - val_loss: 67.6371
Epoch 67/100
46310/46310 [==============================] - 98s 2ms/step - loss: 5.9152e-06 - val_loss: 67.4406
Epoch 68/100
46310/46310 [==============================] - 106s 2ms/step - loss: 5.5672e-06 - val_loss: 67.4671
Epoch 69/100
46310/46310 [==============================] - 99s 2ms/step - loss: 3.8862e-06 - val_loss: 66.8664
Epoch 70/100
46310/46310 [==============================] - 104s 2ms/step - loss: 6.1208e-06 - val_loss: 67.0503
Epoch 71/100
46310/46310 [==============================] - 98s 2ms/step - loss: 7.0417e-06 - val_loss: 68.0381
Epoch 72/100
46310/46310 [==============================] - 105s 2ms/step - loss: 4.3939e-06 - val_loss: 67.5628
Epoch 73/100
46310/46310 [==============================] - 100s 2ms/step - loss: 3.7445e-06 - val_loss: 68.0660
Epoch 74/100
46310/46310 [==============================] - 95s 2ms/step - loss: 7.0234e-06 - val_loss: 67.4700
Epoch 75/100
46310/46310 [==============================] - 100s 2ms/step - loss: 5.9367e-06 - val_loss: 70.2799
Epoch 76/100
46310/46310 [==============================] - 93s 2ms/step - loss: 3.0918e-06 - val_loss: 75.9240
Epoch 77/100
46310/46310 [==============================] - 101s 2ms/step - loss: 4.1638e-06 - val_loss: 80.6550
Epoch 78/100
46310/46310 [==============================] - 96s 2ms/step - loss: 4.1391e-06 - val_loss: 86.4364
Epoch 79/100
46310/46310 [==============================] - 100s 2ms/step - loss: 4.6143e-06 - val_loss: 69.2294
Epoch 80/100
46310/46310 [==============================] - 95s 2ms/step - loss: 7.4677e-06 - val_loss: 67.4877
Epoch 81/100
46310/46310 [==============================] - 100s 2ms/step - loss: 1.2753e-05 - val_loss: 68.0223
Epoch 82/100
46310/46310 [==============================] - 94s 2ms/step - loss: 4.6475e-06 - val_loss: 67.3934
Epoch 83/100
46310/46310 [==============================] - 107s 2ms/step - loss: 4.8149e-06 - val_loss: 68.9184
Epoch 84/100
46310/46310 [==============================] - 96s 2ms/step - loss: 4.4906e-06 - val_loss: 69.5349
Epoch 85/100
46310/46310 [==============================] - 100s 2ms/step - loss: 8.2413e-06 - val_loss: 86.2081
Epoch 86/100
46310/46310 [==============================] - 96s 2ms/step - loss: 7.1900e-06 - val_loss: 76.4586
Epoch 87/100
46310/46310 [==============================] - 102s 2ms/step - loss: 5.0467e-06 - val_loss: 95.7874
Epoch 88/100
46310/46310 [==============================] - 95s 2ms/step - loss: 5.5555e-06 - val_loss: 86.1176
Epoch 89/100
46310/46310 [==============================] - 101s 2ms/step - loss: 4.8585e-06 - val_loss: 68.7187
Epoch 90/100
46310/46310 [==============================] - 97s 2ms/step - loss: 4.9291e-06 - val_loss: 71.0593
Epoch 91/100
46310/46310 [==============================] - 101s 2ms/step - loss: 3.6411e-06 - val_loss: 72.2039
Epoch 92/100
46310/46310 [==============================] - 97s 2ms/step - loss: 6.3930e-06 - val_loss: 67.8203
Epoch 93/100
46310/46310 [==============================] - 100s 2ms/step - loss: 5.2665e-06 - val_loss: 67.4419
Epoch 94/100
46310/46310 [==============================] - 95s 2ms/step - loss: 4.1132e-06 - val_loss: 62.6383
Epoch 95/100
46310/46310 [==============================] - 101s 2ms/step - loss: 6.5242e-06 - val_loss: 68.2449
Epoch 96/100
46310/46310 [==============================] - 97s 2ms/step - loss: 4.3387e-06 - val_loss: 67.4390
Epoch 97/100
46310/46310 [==============================] - 100s 2ms/step - loss: 5.6844e-06 - val_loss: 68.6650
Epoch 98/100
46310/46310 [==============================] - 96s 2ms/step - loss: 4.1366e-06 - val_loss: 67.5588
Epoch 99/100
46310/46310 [==============================] - 109s 2ms/step - loss: 4.8993e-06 - val_loss: 67.7765
Epoch 100/100
46310/46310 [==============================] - 104s 2ms/step - loss: 4.1623e-06 - val_loss: 67.3544
In [44]:
# Predict the reconstructed X_test
X_test_pred = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - X_test_pred, 2), axis=1)

# threshold to classify as fraud or not fraud
threshold = np.mean(mse) + 3*np.std(mse)

# Create the predicted labels based on the threshold
y_pred = np.where(mse > threshold, 1, 0)
11578/11578 [==============================] - 16s 1ms/step
In [45]:
plt.figure(figsize=(10, 7))
plt.plot(X_norm[0], label='Original Data')
plt.plot(X_test_pred[0], label='Reconstructed Data')
plt.legend()
plt.title('Original vs Reconstructed Data')
plt.show()
In [46]:
# Create the confusion matrix and classification report
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

# Print the confusion matrix and classification report
print("Confusion Matrix:")
print(cm)
print("Classification Report:")
print(cr)
C:\Users\ndaru\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

Confusion Matrix:
[[368526      0]
 [  1953      0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    368526
           1       0.00      0.00      0.00      1953

    accuracy                           0.99    370479
   macro avg       0.50      0.50      0.50    370479
weighted avg       0.99      0.99      0.99    370479

C:\Users\ndaru\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

C:\Users\ndaru\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

In [47]:
# Plot the confusion matrix
plt.figure(figsize=(6,5))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.colorbar()
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.xticks([0,1])
plt.yticks([0,1])
plt.show()

Evaluate Performance¶

In [48]:
# Evaluate the performance of the autoencoder
mse = autoencoder.evaluate(X_norm, X_norm, verbose=0)
rmse = np.sqrt(mse)

print("MSE: {:.8f}".format(mse))
print("RMSE: {:.8f}".format(rmse))
MSE: 0.00000003
RMSE: 0.00016254
In [49]:
# Calculate the accuracy
accuracy_AE = accuracy_score(y_test, y_pred)
print("Fraud detection accuracy (AE): {:.2f}%".format(accuracy_AE * 100))
Fraud detection accuracy (AE): 99.47%
In [ ]:
 
In [ ]: